
import pandas as pd
import requests
import re
import time

# Tmall review-list endpoint, split around the page number so that the page
# can be spliced in.  The prefix must end in "currentPage=" with no digit
# after it; a trailing "1" would turn page 3 into page 13.
PAGE_URL_PREFIX = 'https://rate.tmall.com/list_detail_rate.htm?itemId=596452219968&spuId=1240258038&sellerId=1579115485&order=3&currentPage='
PAGE_URL_SUFFIX = '&append=0&content=1&tagId=&posi=&picture=&groupId=&ua=098%23E1hvB9vnvPgvUvCkvvvvvjiPn25pQjlhPFSv0jthPmPy6jiPR2MwAjnjRLF9gjlERphvCvvvphmjvpvhvUCvp8wCvvpvvhHhmphvLvUIUkUaQCAwe1O0747BhCka%2BoHoDOvfjLeAnhjEKBmAdXIaUExreTgcnkxb5ah6Hd8ram56D40OdiUDNrBlHd8reC69D70fd3J18heivpvUvvCCWUB0wV0EvpvVvpCmpJ2vKphv8vvvpHwvvvvvvvCmqvvvv4pvvhZLvvmCvvvvBBWvvvjwvvCHhQvvvxQCvpvVvUCvpvvv2QhvCvvvMMGtvpvhvvCvp86CvChh9P2s3QvvC0ODj6KHkoVQROhCvCLwMbra3rMwznsJWxS5gn1Uzvr4486Cvvyv9mQS7Qvvm4p%3D&needFold=0&_ksTS=1585406932472_453&callback=jsonp454'

# Cookie identifying the requesting user (guest or registered).  The original
# script left this value blank, which made the file unparsable; paste the
# cookie of a logged-in browser session here before running.
COOKIE = ''

# Request headers sent with every page fetch.
HEADERS = {
    # Which browser the request claims to come from.
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    # Page the data request is issued from; may differ from site to site.
    'referer': 'https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.464b6bbfQwJmpT&id=596452219968&skuId=4313616443848&areaId=340700&user_id=1579115485&cat_id=2&is_b=1&rn=2aaf4f3d019121cb4b9c1816fe2eb360',
}
if COOKIE:
    # Only send the header when a cookie has actually been configured.
    HEADERS['cookie'] = COOKIE

# Captures the text between `rateContent":"` and the next `"fromMall"`.
# NOTE(review): because the capture runs up to the `"fromMall"` key, it may
# include trailing delimiters such as `",` -- confirm against a real response.
REVIEW_PATTERN = re.compile('rateContent":"(.*?)"fromMall"')

REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs forever
PAGE_DELAY = 10       # seconds to wait after each successful fetch


def build_page_url(page: int) -> str:
    """Return the review-list URL for the given page number."""
    return PAGE_URL_PREFIX + str(page) + PAGE_URL_SUFFIX


def extract_reviews(payload: str) -> list:
    """Return every review string matched by REVIEW_PATTERN in *payload*.

    Returns an empty list when nothing matches.
    """
    return REVIEW_PATTERN.findall(payload)


def scrape_reviews(pages) -> list:
    """Fetch each page in *pages* and return all extracted reviews in order.

    A page whose request fails is reported on stdout and skipped, so one bad
    page does not abort the run.  Only request errors are caught, which keeps
    Ctrl-C and programming errors visible.
    """
    reviews = []
    for page in pages:
        print("正在爬取第" + str(page) + "页")
        try:
            payload = requests.get(
                build_page_url(page),
                headers=HEADERS,
                timeout=REQUEST_TIMEOUT,
            ).text
        except requests.RequestException as exc:
            print("本页爬取失败", exc)
            continue
        # Pause between pages to avoid hitting the server too quickly.
        time.sleep(PAGE_DELAY)
        found = extract_reviews(payload)
        print(found)
        reviews.extend(found)
    return reviews


# The scrape runs only when the file is executed as a script, so importing the
# module does not trigger network requests or file writes.
if __name__ == "__main__":
    data_list = scrape_reviews(range(1, 20))
    df = pd.DataFrame({"review": data_list})
    # Append to the existing CSV, without header row or index column.
    df.to_csv("../data/coms.csv", mode="a+", header=False, index=False, encoding="utf-8")
